#’ Tasks
#’ 1. Carry an appropriate association rules mining, using the Apriori algorithm. Choose relevant minimum support and confidence values for this dataset. #’ 2. Select and comment a few rules that you may find more relevant, considering the main rules’ metrics: support, confidence, and lift.
#’ 3. Compile a commented report in PDF or HTML directly from RSudio.
#’ Note:
#’ When using a ‘wide’ dataset to create a transaction object, all #’
variables must either be logical or factor. Therefore, it is #’
recommended to: #’ #’ 1. Discretize all numerical variables (eg.
age). Discretization #’ can be carried with the function
discretize() from package arules. #’ 2.
Factorize all categorical or text variables. Factorization can be #’
carried with function factor from R base. #’ 3. Set
argument format = "wide" (default) when generating a #’
transactions object. Argument cols is not needed for this
format.
#Clening cache
#Load Library
## Carregando pacotes exigidos: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
Load dataset
adult_income <- read_xlsx("C:/Users/fabia/OneDrive/Área de Trabalho/machine learning R/adult_income.xlsx")#look your dados
## Rows: 48,842
## Columns: 10
## $ age <dbl> 25, 38, 28, 44, 18, 34, 29, 63, 24, 55, 65, 36, 26, 58,…
## $ workclass <chr> "Private", "Private", "Local-gov", "Private", NA, "Priv…
## $ education <chr> "11th", "HS-grad", "Assoc-acdm", "Some-college", "Some-…
## $ marital.status <chr> "Never-married", "Married-civ-spouse", "Married-civ-spo…
## $ occupation <chr> "Machine-op-inspct", "Farming-fishing", "Protective-ser…
## $ relationship <chr> "Own-child", "Husband", "Husband", "Husband", "Own-chil…
## $ race <chr> "Black", "White", "White", "Black", "White", "White", "…
## $ gender <chr> "Male", "Male", "Male", "Male", "Female", "Male", "Male…
## $ native.country <chr> "United-States", "United-States", "United-States", "Uni…
## $ income <chr> "<=50K", "<=50K", ">50K", ">50K", "<=50K", "<=50K", "<=…
#’ Note that this dataset is presented in a “long” (ie. horizontal) format
## # A tibble: 15 × 10
## age workclass education marital.status occupation relationship race gender
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 25 Private 11th Never-married Machine-o… Own-child Black Male
## 2 38 Private HS-grad Married-civ-s… Farming-f… Husband White Male
## 3 28 Local-gov Assoc-ac… Married-civ-s… Protectiv… Husband White Male
## 4 44 Private Some-col… Married-civ-s… Machine-o… Husband Black Male
## 5 18 <NA> Some-col… Never-married <NA> Own-child White Female
## 6 34 Private 10th Never-married Other-ser… Not-in-fami… White Male
## 7 29 <NA> HS-grad Never-married <NA> Unmarried Black Male
## 8 63 Self-emp… Prof-sch… Married-civ-s… Prof-spec… Husband White Male
## 9 24 Private Some-col… Never-married Other-ser… Unmarried White Female
## 10 55 Private 7th-8th Married-civ-s… Craft-rep… Husband White Male
## 11 65 Private HS-grad Married-civ-s… Machine-o… Husband White Male
## 12 36 Federal-… Bachelors Married-civ-s… Adm-cleri… Husband White Male
## 13 26 Private HS-grad Never-married Adm-cleri… Not-in-fami… White Female
## 14 58 <NA> HS-grad Married-civ-s… <NA> Husband White Male
## 15 48 Private HS-grad Married-civ-s… Machine-o… Husband White Male
## # ℹ 2 more variables: native.country <chr>, income <chr>
discretize()
#dados factor
adult_income3 <- lapply(adult_income, function(x) {
if(is.character(x) || is.factor(x)) {
return(factor(x))
} else {
return(x)
}
})#’ In this case, our dataset is presented in a “wide” format. #’ Therefore we must set argument ‘format’ to “wide”, #’ and name which columns contain ‘TransactionID’ and ‘Item’.
#' In this case, our dataset is presented in a "wide" format.
#' Therefore we must set argument 'format' to "wide",
#' and name which columns contain 'TransactionID' and 'Item'.
transactions <- arules::transactions(adult_income,
format = "wide")## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 not logical or factor.
## Applying default discretization (see '? discretizeDF').
## transactions in sparse format with
## 48842 transactions (rows) and
## 107 items (columns)
## transactions as itemMatrix in sparse format with
## 48842 rows (elements/itemsets/transactions) and
## 107 columns (items) and a density of 0.1015667
##
## most frequent items:
## native.country=United-States race=White
## 43832 41762
## income=<=50K workclass=Private
## 37155 33906
## gender=Male (Other)
## 32650 341492
##
## element (itemset/transaction) length distribution:
## sizes
## 8 9 10 11
## 46 2753 821 45222
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 11.00 11.00 10.87 11.00 11.00
##
## includes extended item information - examples:
## labels variables levels
## 1 age=[17,31) age [17,31)
## 2 age=[31,44) age [31,44)
## 3 age=[44,90] age [44,90]
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
#' Plot most frequent items in a barplot
itemFrequencyPlot(transactions,
topN = 15,
type = 'absolute',
horiz = T,
cex.names = 0.8)rules <- apriori(transactions,
parameter = list(supp = 0.01, # minimum Support
conf = 0.60, # minimum Confidence
minlen = 2)) # minimum length of items (lhs+rhs)## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 488
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[107 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [59 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transactions, parameter = list(supp = 0.01, conf = 0.6, :
## Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
## done [0.10s].
## writing ... [84887 rule(s)] done [0.01s].
## creating S4 object ... done [0.12s].
## set of 84887 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7 8 9 10
## 278 3042 11738 22461 24206 15623 6089 1327 123
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 5.000 6.000 5.718 7.000 10.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01001 Min. :0.6000 Min. :0.01001 Min. : 0.7138
## 1st Qu.:0.01267 1st Qu.:0.8229 1st Qu.:0.01448 1st Qu.: 1.0435
## Median :0.01732 Median :0.9204 Median :0.02011 Median : 1.2851
## Mean :0.02814 Mean :0.8881 Mean :0.03237 Mean : 1.7108
## 3rd Qu.:0.02905 3rd Qu.:0.9968 3rd Qu.:0.03307 3rd Qu.: 2.4070
## Max. :0.78811 Max. :1.0000 Max. :0.89742 Max. :20.6605
## count
## Min. : 489
## 1st Qu.: 619
## Median : 846
## Mean : 1375
## 3rd Qu.: 1419
## Max. :38493
##
## mining info:
## data ntransactions support confidence
## transactions 48842 0.01 0.6
## call
## apriori(data = transactions, parameter = list(supp = 0.01, conf = 0.6, minlen = 2))
## lhs rhs support confidence coverage lift count
## [1] {education=Doctorate} => {race=White} 0.01076942 0.8855219 0.01216166 1.0356463 526
## [2] {marital.status=Married-spouse-absent} => {income=<=50K} 0.01167028 0.9076433 0.01285779 1.1931399 570
## [3] {education=12th} => {workclass=Private} 0.01048278 0.7792998 0.01345154 1.1225908 512
## [4] {education=12th} => {income=<=50K} 0.01246878 0.9269406 0.01345154 1.2185072 609
## [5] {education=12th} => {race=White} 0.01054420 0.7838661 0.01345154 0.9167565 515
#' Sort rules by descending metric of interest,
#' and inspect the top n few.
rules <- sort(rules, by = 'lift', decreasing = TRUE)
# options(digits = 4) # to reduce the number of digits in the output
inspect(rules[1:15])## lhs rhs support confidence coverage lift count
## [1] {marital.status=Married-civ-spouse,
## gender=Female,
## native.country=United-States,
## income=>50K,
## age_discrete=[31,44)} => {relationship=Wife} 0.01011425 0.9860279 0.01025757 20.66048 494
## [2] {age=[31,44),
## marital.status=Married-civ-spouse,
## gender=Female,
## native.country=United-States,
## income=>50K} => {relationship=Wife} 0.01011425 0.9860279 0.01025757 20.66048 494
## [3] {age=[31,44),
## marital.status=Married-civ-spouse,
## gender=Female,
## native.country=United-States,
## income=>50K,
## age_discrete=[31,44)} => {relationship=Wife} 0.01011425 0.9860279 0.01025757 20.66048 494
## [4] {marital.status=Married-civ-spouse,
## gender=Female,
## income=>50K,
## age_discrete=[31,44)} => {relationship=Wife} 0.01124033 0.9821109 0.01144507 20.57840 549
## [5] {age=[31,44),
## marital.status=Married-civ-spouse,
## gender=Female,
## income=>50K} => {relationship=Wife} 0.01124033 0.9821109 0.01144507 20.57840 549
## [6] {age=[31,44),
## marital.status=Married-civ-spouse,
## gender=Female,
## income=>50K,
## age_discrete=[31,44)} => {relationship=Wife} 0.01124033 0.9821109 0.01144507 20.57840 549
## [7] {workclass=Private,
## marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## native.country=United-States,
## income=>50K} => {relationship=Wife} 0.01103558 0.9782214 0.01128127 20.49691 539
## [8] {marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## native.country=United-States,
## income=>50K} => {relationship=Wife} 0.01809918 0.9778761 0.01850866 20.48967 884
## [9] {marital.status=Married-civ-spouse,
## gender=Female,
## native.country=United-States,
## income=>50K} => {relationship=Wife} 0.01998280 0.9769770 0.02045371 20.47083 976
## [10] {workclass=Private,
## marital.status=Married-civ-spouse,
## gender=Female,
## native.country=United-States,
## income=>50K} => {relationship=Wife} 0.01199787 0.9766667 0.01228451 20.46433 586
## [11] {workclass=Private,
## marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## income=>50K} => {relationship=Wife} 0.01189550 0.9764706 0.01218214 20.46022 581
## [12] {marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## income=>50K} => {relationship=Wife} 0.01943000 0.9763374 0.01990090 20.45743 949
## [13] {workclass=Private,
## marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## native.country=United-States,
## age_discrete=[31,44)} => {relationship=Wife} 0.01044183 0.9751434 0.01070800 20.43241 510
## [14] {age=[31,44),
## workclass=Private,
## marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## native.country=United-States} => {relationship=Wife} 0.01044183 0.9751434 0.01070800 20.43241 510
## [15] {age=[31,44),
## workclass=Private,
## marital.status=Married-civ-spouse,
## race=White,
## gender=Female,
## native.country=United-States,
## age_discrete=[31,44)} => {relationship=Wife} 0.01044183 0.9751434 0.01070800 20.43241 510
#' Explore by searching specific items, either on
#' the Antecedent (lhs: left-hand side) or
#' the Consequent (rhs: right-hand side)
#'
#' Useful matching operators:
#' %in% : select itemsets matching any given item
#' %ain% : select itemsets matching ALL given items
#' %oin% : select itemsets matching ONLY the given item
#' %pin% : equivalent to %in% with PARTIAL matching
rules_filtered <- subset(rules, subset = rhs %pin% "heart")
inspect(rules_filtered)